In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math
In [2]:
data = pd.read_csv('HCEPDB/HCEPDB_moldata.csv')
In [3]:
data.head(10)
Out[3]:
id
SMILES_str
stoich_str
mass
pce
voc
jsc
e_homo_alpha
e_gap_alpha
e_lumo_alpha
tmp_smiles_str
0
655365
C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
C18H9N3OSSe
394.3151
5.161953
0.867601
91.567575
-5.467601
2.022944
-3.444656
C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
1
1245190
C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]...
C22H15NSeSi
400.4135
5.261398
0.504824
160.401549
-5.104824
1.630750
-3.474074
C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
2
21847
C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1...
C24H17NOSi
363.4903
0.000000
0.000000
197.474780
-4.539526
1.462158
-3.077368
C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
3
65553
[SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1
C12H12SeSi3
319.4448
6.138294
0.630274
149.887545
-5.230274
1.682250
-3.548025
C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
4
720918
C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1
C20H12OSSe
379.3398
1.991366
0.242119
126.581347
-4.842119
1.809439
-3.032680
C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1
5
1310744
C1C=CC=C1c1cc2[se]c3c(c4nsnc4c4ccncc34)c2c2ccc...
C24H13N3SSe
454.4137
5.605135
0.951911
90.622776
-5.551911
2.029717
-3.522194
C1=CC=C(C1)c1cc2[se]c3c(c4nsnc4c4ccncc34)c2c2c...
6
196637
C1C=CC=C1c1cc2[se]c3cc4ccsc4cc3c2[se]1
C17H10SSe2
404.2520
2.644436
0.587932
69.223461
-5.187932
2.201106
-2.986827
C1=CC=C(C1)c1cc2[se]c3cc4ccsc4cc3c2[se]1
7
262174
C1C=CC=C1c1cc2[se]c3c4occc4c4cscc4c3c2[se]1
C19H10OSSe2
444.2730
2.523057
0.397670
97.645325
-4.997670
1.982122
-3.015548
C1=CC=C(C1)c1cc2[se]c3c4occc4c4cscc4c3c2[se]1
8
393249
C1C=CC=C1c1cc2[se]c3cc4cccnc4cc3c2c2ccccc12
C24H15NSe
396.3495
3.115895
0.869140
55.174815
-5.469140
2.331815
-3.137325
C1=CC=C(C1)c1cc2[se]c3cc4cccnc4cc3c2c2ccccc12
9
35
C1C2=C([SiH2]C=C2)C=C1c1cc2occc2c2cscc12
C17H12OSSi
292.4328
2.743214
0.387106
109.062905
-4.987106
1.909966
-3.077141
C1=CC2=C([SiH2]1)C=C(C2)c1cc2occc2c2cscc12
In [4]:
data.tail(10)
Out[4]:
id
SMILES_str
stoich_str
mass
pce
voc
jsc
e_homo_alpha
e_gap_alpha
e_lumo_alpha
tmp_smiles_str
2322839
1703911
C1cc2c(ccc(-c3cccnc3)c2c1)-c1sc(-c2scc3cc[SiH2...
C26H19NS2Si2
465.7471
4.881051
0.657693
114.218791
-5.257693
1.876279
-3.381414
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2322840
1814506
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
C23H16N2S3Si2
472.7634
3.353182
0.461167
111.904241
-5.061167
1.892000
-3.169167
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
2322841
2559314
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
C23H15NOS3Si2
473.7475
4.263377
0.688326
95.325067
-5.288326
1.998713
-3.289613
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
2322842
2351086
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C24H16N2S3Si2
484.7744
6.662663
0.850060
120.627407
-5.450060
1.839686
-3.610374
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2322843
1712111
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c...
C24H12OS6Si
536.8398
2.951709
0.279912
162.292795
-4.879912
1.615145
-3.264767
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c...
2322844
2543603
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1cnc(...
C22H14N4S3Si2
486.7506
0.000000
0.000000
0.000000
-5.632512
1.454082
-4.178430
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1cnc(...
2322845
2304057
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C22H14N4S3Si2
486.7506
9.335485
1.120744
128.197094
-5.720744
1.798600
-3.922144
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2322846
2007035
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C26H18S3Si2
482.7982
2.498209
0.834995
46.046052
-5.434995
2.433160
-3.001835
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2322847
1961981
C1ccc2c1c(sc2-c1scc2cc[SiH2]c12)-c1ccc(cc1)-c1...
C25H16S3SeSi
519.6454
2.679067
0.659243
62.544032
-5.259243
2.258468
-3.000775
c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1ccc(cc1)-...
2322848
2754558
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c...
C24H13NOS5Si
519.7887
1.272400
0.102802
190.489616
-4.702802
1.490950
-3.211851
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c...
In [5]:
data.shape
Out[5]:
(2322849, 11)
In [6]:
data.columns
Out[6]:
Index(['id', 'SMILES_str', 'stoich_str', 'mass', 'pce', 'voc', 'jsc',
'e_homo_alpha', 'e_gap_alpha', 'e_lumo_alpha', 'tmp_smiles_str'],
dtype='object')
In [7]:
data.set_index('id').head(5)
Out[7]:
SMILES_str
stoich_str
mass
pce
voc
jsc
e_homo_alpha
e_gap_alpha
e_lumo_alpha
tmp_smiles_str
id
655365
C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
C18H9N3OSSe
394.3151
5.161953
0.867601
91.567575
-5.467601
2.022944
-3.444656
C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
1245190
C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]...
C22H15NSeSi
400.4135
5.261398
0.504824
160.401549
-5.104824
1.630750
-3.474074
C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
21847
C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1...
C24H17NOSi
363.4903
0.000000
0.000000
197.474780
-4.539526
1.462158
-3.077368
C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
65553
[SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1
C12H12SeSi3
319.4448
6.138294
0.630274
149.887545
-5.230274
1.682250
-3.548025
C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
720918
C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1
C20H12OSSe
379.3398
1.991366
0.242119
126.581347
-4.842119
1.809439
-3.032680
C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1
In [8]:
data.dtypes
Out[8]:
id int64
SMILES_str object
stoich_str object
mass float64
pce float64
voc float64
jsc float64
e_homo_alpha float64
e_gap_alpha float64
e_lumo_alpha float64
tmp_smiles_str object
dtype: object
In [9]:
data['(xi-x)^2'] = (data['mass'] - data['mass'].mean())**2
data.head()
Out[9]:
id
SMILES_str
stoich_str
mass
pce
voc
jsc
e_homo_alpha
e_gap_alpha
e_lumo_alpha
tmp_smiles_str
(xi-x)^2
0
655365
C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
C18H9N3OSSe
394.3151
5.161953
0.867601
91.567575
-5.467601
2.022944
-3.444656
C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
451.517873
1
1245190
C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]...
C22H15NSeSi
400.4135
5.261398
0.504824
160.401549
-5.104824
1.630750
-3.474074
C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
229.539163
2
21847
C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1...
C24H17NOSi
363.4903
0.000000
0.000000
197.474780
-4.539526
1.462158
-3.077368
C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
2711.675432
3
65553
[SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1
C12H12SeSi3
319.4448
6.138294
0.630274
149.887545
-5.230274
1.682250
-3.548025
C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
9238.910207
4
720918
C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1
C20H12OSSe
379.3398
1.991366
0.242119
126.581347
-4.842119
1.809439
-3.032680
C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1
1312.196283
In [39]:
SD = math.sqrt(sum(data['(xi-x)^2'])/data.shape[0])
M = data['mass'].mean()
print('SD = ',SD,', mean = ',M)
SD = 59.853157780691824 , mean = 415.564049928
In [55]:
data['mass_group'] = pd.cut(data['mass'],bins=[min(data['mass']),M-3*SD,M-2*SD,M-SD,M+SD,M+2*SD,M+3*SD,max(data['mass'])],labels=["<(-3SD)","-3SD~-2SD","-2SD~-SD","-SD~+SD","+SD~+2SD","+2SD~+3SD",">(+3SD)"])
In [56]:
data
Out[56]:
id
SMILES_str
stoich_str
mass
pce
voc
jsc
e_homo_alpha
e_gap_alpha
e_lumo_alpha
tmp_smiles_str
(xi-x)^2
mass_group
0
655365
C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
C18H9N3OSSe
394.3151
5.161953
0.867601
91.567575
-5.467601
2.022944
-3.444656
C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
451.517873
-SD~+SD
1
1245190
C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]...
C22H15NSeSi
400.4135
5.261398
0.504824
160.401549
-5.104824
1.630750
-3.474074
C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
229.539163
-SD~+SD
2
21847
C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1...
C24H17NOSi
363.4903
0.000000
0.000000
197.474780
-4.539526
1.462158
-3.077368
C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
2711.675432
-SD~+SD
3
65553
[SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1
C12H12SeSi3
319.4448
6.138294
0.630274
149.887545
-5.230274
1.682250
-3.548025
C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
9238.910207
-2SD~-SD
4
720918
C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1
C20H12OSSe
379.3398
1.991366
0.242119
126.581347
-4.842119
1.809439
-3.032680
C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1
1312.196283
-SD~+SD
5
1310744
C1C=CC=C1c1cc2[se]c3c(c4nsnc4c4ccncc34)c2c2ccc...
C24H13N3SSe
454.4137
5.605135
0.951911
90.622776
-5.551911
2.029717
-3.522194
C1=CC=C(C1)c1cc2[se]c3c(c4nsnc4c4ccncc34)c2c2c...
1509.295311
-SD~+SD
6
196637
C1C=CC=C1c1cc2[se]c3cc4ccsc4cc3c2[se]1
C17H10SSe2
404.2520
2.644436
0.587932
69.223461
-5.187932
2.201106
-2.986827
C1=CC=C(C1)c1cc2[se]c3cc4ccsc4cc3c2[se]1
127.962474
-SD~+SD
7
262174
C1C=CC=C1c1cc2[se]c3c4occc4c4cscc4c3c2[se]1
C19H10OSSe2
444.2730
2.523057
0.397670
97.645325
-4.997670
1.982122
-3.015548
C1=CC=C(C1)c1cc2[se]c3c4occc4c4cscc4c3c2[se]1
824.203814
-SD~+SD
8
393249
C1C=CC=C1c1cc2[se]c3cc4cccnc4cc3c2c2ccccc12
C24H15NSe
396.3495
3.115895
0.869140
55.174815
-5.469140
2.331815
-3.137325
C1=CC=C(C1)c1cc2[se]c3cc4cccnc4cc3c2c2ccccc12
369.198929
-SD~+SD
9
35
C1C2=C([SiH2]C=C2)C=C1c1cc2occc2c2cscc12
C17H12OSSi
292.4328
2.743214
0.387106
109.062905
-4.987106
1.909966
-3.077141
C1=CC2=C([SiH2]1)C=C(C2)c1cc2occc2c2cscc12
15161.304709
-3SD~-2SD
10
1048612
C1C=CC=C1C1=Cc2sc3cc4C=C[SiH2]c4cc3c2C1
C18H14SSi
290.4606
2.408411
0.431315
85.937708
-5.031315
2.065850
-2.965465
C1=CC=C(C1)C1=Cc2sc3cc4C=C[SiH2]c4cc3c2C1
15650.873184
-3SD~-2SD
11
917542
C1C=c2ccc3[se]c4c5[se]c(cc5[se]c4c3c2=C1)C1=CC...
C20H12Se3
489.1948
2.843278
0.302591
144.614366
-4.902591
1.708198
-3.194393
C1=CC=C(C1)c1cc2[se]c3c([se]c4ccc5=CCC=c5c34)c...
5421.487356
+SD~+2SD
12
1441831
C1C=CC=C1C1=Cc2ncc3c4[se]ccc4cnc3c2C1
C18H12N2Se
335.2668
2.687240
0.675497
61.225278
-5.275497
2.270953
-3.004544
C1=CC=C(C1)C1=Cc2ncc3c4[se]ccc4cnc3c2C1
6447.648346
-2SD~-SD
13
1376296
C1C=CC=C1C1=Cc2c(C1)c1[se]c3ccc4cscc4c3c1c1=C[...
C24H16SSeSi
443.5024
2.844637
0.189206
231.387394
-4.789206
1.312334
-3.476872
C1=CC=C(C1)C1=Cc2c(C1)c1[se]c3ccc4cscc4c3c1c1=...
780.551405
-SD~+SD
14
1638442
C1C=c2ccc3cnc4c5[SiH2]C(=Cc5c5nsnc5c4c3c2=C1)C...
C23H15N3SSi
393.5445
6.462512
0.602405
165.105179
-5.202405
1.603165
-3.599240
C1=CC=C(C1)C1=Cc2c([SiH2]1)c1ncc3ccc4=CCC=c4c3...
484.860579
-SD~+SD
15
98350
C1C=CC=C1C1=Cc2ccc3c4CC=Cc4c4cscc4c3c2[SiH2]1
C22H16SSi
340.5204
2.631463
0.410851
98.573546
-5.010851
1.975707
-3.035144
C1=CC=C(C1)C1=Cc2ccc3c4CC=Cc4c4cscc4c3c2[SiH2]1
5631.549395
-2SD~-SD
16
2162747
C1C=CC=C1C1=Cc2c([SiH2]1)c1c3c[nH]cc3c3ccc4=C[...
C27H19NOSi2
429.6251
2.039158
0.140744
222.981280
-4.740744
1.361137
-3.379607
C1=CC=C(C1)C1=Cc2c([SiH2]1)c1c3c[nH]cc3c3ccc4=...
197.713129
-SD~+SD
17
557119
C1C=c2c3C=C(Cc3c3occc3c2=C1)C1=CC=CC1
C19H14O
258.3186
0.237205
0.024962
146.246545
-4.624962
1.700415
-2.924547
C1=CC=C(C1)C1=Cc2c(C1)c1occc1c1=CCC=c21
24726.131523
-3SD~-2SD
18
753728
C1C=CC=C1C1=Cc2c([SiH2]1)c1cc3ncccc3cc1c1c[nH]...
C22H16N2Si
336.4684
3.103831
0.409504
116.650708
-5.009504
1.863416
-3.146088
C1=CC=C(C1)C1=Cc2c([SiH2]1)c1cc3ncccc3cc1c1c[n...
6256.121838
-2SD~-SD
19
819265
C1C=CC=C1C1=Cc2c([SiH2]1)c1c(c3cscc23)c2[se]cc...
C23H16SSeSi2
459.5774
5.385253
0.368606
224.848916
-4.968606
1.352309
-3.616298
C1=CC=C(C1)C1=Cc2c([SiH2]1)c1c(c3cscc23)c2[se]...
1937.174985
-SD~+SD
20
1278019
C1C=CC=C1C1=Cc2c([SiH2]1)c1c(c3[SiH2]C=Cc3c3=C...
C23H18OSi3
394.6522
5.489489
0.301242
280.455932
-4.901242
1.135619
-3.765623
C1=CC=C(C1)C1=Cc2c([SiH2]1)c1c(c3[SiH2]C=Cc3c3...
437.305467
-SD~+SD
21
2096063
C1C=CC=C1c1cc2[se]c3c(c2c2cscc12)c1ccccc1c1ccc...
C27H14N2S2Se
509.5136
6.204093
0.570055
167.497914
-5.170055
1.593078
-3.576977
C1=CC=C(C1)c1cc2[se]c3c(c2c2cscc12)c1ccccc1c1c...
8826.517959
+SD~+2SD
22
2752585
C1C=CC=C1C1=Cc2c(C1)c1c(c3c[nH]cc23)c2c3c[nH]c...
C28H20N2Si
412.5660
0.000000
0.000000
198.749914
-4.499447
1.457208
-3.042239
C1=CC=C(C1)C1=Cc2c(C1)c1c(c3c[nH]cc23)c2c3c[nH...
8.988303
-SD~+SD
23
1572945
C1C=CC=C1C1=Cc2[se]c3c4sccc4c4ccccc4c3c2C1
C22H14SSe
389.3786
2.167252
0.330623
100.884304
-4.930623
1.961253
-2.969370
C1=CC=C(C1)C1=Cc2[se]c3c4sccc4c4ccccc4c3c2C1
685.677788
-SD~+SD
24
2359381
C1C=CC=C1C1=Cc2c(C1)c1c3cscc3c3ccc4nsnc4c3c1c1...
C26H14N2OS2
434.5416
4.112982
0.299549
211.318161
-4.899549
1.409229
-3.490319
C1=CC=C(C1)C1=Cc2c(C1)c1c3cscc3c3ccc4nsnc4c3c1...
360.147407
-SD~+SD
25
1540183
C1C=CC=C1c1cc2[se]c3c([se]c4ccc5cscc5c34)c2cn1
C20H11NSSe2
455.2999
3.212565
0.683568
72.329945
-5.283568
2.174712
-3.108856
C1=CC=C(C1)c1cc2[se]c3c([se]c4ccc5cscc5c34)c2cn1
1578.937781
-SD~+SD
26
1638500
C1C=CC=C1c1cc2[se]c3ccc4ccccc4c3c2c2cocc12
C23H14OSe
385.3226
3.088844
0.482262
98.573546
-5.082262
1.977235
-3.105027
C1=CC=C(C1)c1cc2[se]c3ccc4ccccc4c3c2c2cocc12
914.545294
-SD~+SD
27
2621542
C1C=c2c3ccccc3c3c4ccccc4c4C=C(Cc4c3c2=C1)C1=CC...
C29H20
368.4770
2.552886
0.341115
115.180406
-4.941115
1.872759
-3.068355
C1=CC=C(C1)C1=Cc2c(C1)c1c(c3ccccc23)c2ccccc2c2...
2217.190271
-SD~+SD
28
98411
C1C=CC=C1c1cc2[se]c3cc4cccnc4cc3c2c2cscc12
C22H13NSSe
402.3777
4.247356
0.653960
99.957476
-5.253960
1.967245
-3.286715
C1=CC=C(C1)c1cc2[se]c3cc4cccnc4cc3c2c2cscc12
173.879824
-SD~+SD
29
524398
C1C=c2c3C=C([SiH2]c3c3ncc4ccc5nsnc5c4c3c2=C1)C...
C23H15N3SSi
393.5445
5.860942
0.497394
181.348711
-5.097394
1.533947
-3.563447
C1=CC=C(C1)C1=Cc2c([SiH2]1)c1ncc3ccc4nsnc4c3c1...
484.860579
-SD~+SD
...
...
...
...
...
...
...
...
...
...
...
...
...
...
2322819
2705444
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C25H17NS3Si2
483.7863
2.976815
0.892533
51.330433
-5.492533
2.373489
-3.119045
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
4654.275405
+SD~+2SD
2322820
2925216
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4occc34)c...
C24H12O2S5Si
520.7728
3.687312
0.323482
175.431612
-4.923482
1.558371
-3.365111
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4occc34)c...
11068.881092
+SD~+2SD
2322821
2742210
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccoc34)c...
C24H12O2S5Si
520.7728
3.036407
0.280599
166.541080
-4.880599
1.596420
-3.284179
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccoc34)c...
11068.881092
+SD~+2SD
2322822
3092419
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C23H15N3S3Si2
485.7625
5.766431
1.000112
88.737230
-5.600112
2.045365
-3.554748
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
4927.822392
+SD~+2SD
2322823
1253317
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C23H17NS2Si2
427.6983
2.569183
1.021842
38.695335
-5.621842
2.523390
-3.098452
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
147.240025
-SD~+SD
2322824
1841096
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C25H17NOS2Si2
467.7193
3.651471
0.838712
67.004278
-5.438712
2.220525
-3.218187
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2720.170110
-SD~+SD
2322825
2770889
C1ccc2c1c(sc2-c1scc2cc[SiH2]c12)-c1ccc(-c2cccc...
C26H17NS3Si
467.7113
3.294399
0.667854
75.917576
-5.267854
2.143414
-3.124440
c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1ccc(-c2cc...
2719.335690
-SD~+SD
2322826
1816522
C1ccc2c1c(sc2-c1scc2cc[SiH2]c12)-c1sc(-c2ccccc...
C25H16S4Si
472.7514
3.297434
0.473489
107.179926
-5.073489
1.921142
-3.152347
c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1sc(-c2ccc...
3270.393008
-SD~+SD
2322827
1810382
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C25H17NOS2Si2
467.7193
3.581623
0.762095
72.329945
-5.362095
2.171842
-3.190253
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2720.170110
-SD~+SD
2322828
1648591
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccoc34)c...
C24H12O3S4Si
504.7058
2.780562
0.264955
161.513282
-4.864955
1.618879
-3.246076
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccoc34)c...
7946.251606
+SD~+2SD
2322829
2705360
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccoc34)c...
C24H13NO2S4Si
503.7217
1.063303
0.087194
187.679800
-4.687194
1.502985
-3.184210
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccoc34)c...
7771.771266
+SD~+2SD
2322830
2349009
C1ccc2csc(c12)-c1ccc(cn1)-c1sc(-c2scc3cc[SiH2]...
C24H17NS3Si2
471.7753
2.802896
0.911719
47.314404
-5.511719
2.421182
-3.090538
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
3159.704635
-SD~+SD
2322831
3091107
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c...
C24H14OS5Si2
534.8756
3.770352
0.412894
140.537136
-5.012894
1.732059
-3.280835
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c...
14235.245980
+SD~+2SD
2322832
8152
[SiH2]1ccc2csc(c12)-c1sc(-c2scc3cc[se]c23)c2[s...
C18H10S3Se2Si
508.4810
2.887419
0.549016
80.941730
-5.149016
2.101911
-3.047105
c1sc(c2[SiH2]ccc12)-c1sc(-c2scc3cc[se]c23)c2[s...
8633.559611
+SD~+2SD
2322833
1781722
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C23H16N2S3Si2
472.7634
2.814019
0.556938
77.762059
-5.156938
2.127099
-3.029839
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
3271.765649
-SD~+SD
2322834
2470223
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4sccc34)c...
C24H13NS6Si
535.8557
2.445740
0.207560
181.348711
-4.807560
1.533100
-3.274460
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4sccc34)c...
14470.081077
+2SD~+3SD
2322835
2469856
C1ccc2c1c(sc2-c1sc(-c2scc3cc[SiH2]c23)c2ccoc12...
C25H15NOS4Si
501.7495
2.143418
0.227460
145.026911
-4.827460
1.707258
-3.120202
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3Cccc23)-c2scc...
7427.931804
+SD~+2SD
2322836
1912803
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccoc34)c...
C24H12O3S4Si
504.7058
2.656897
0.274521
148.952385
-4.874521
1.686757
-3.187764
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccoc34)c...
7946.251606
+SD~+2SD
2322837
1216485
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1cccc...
C18H12N2S3Si2
408.6768
7.594213
0.993521
117.639554
-5.593521
1.857476
-3.736045
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1cccc...
47.434212
-SD~+SD
2322838
2619366
C1cc2c(ccc(-c3ccccc3)c2c1)-c1sc(-c2scc3cc[SiH2...
C28H20S2Si
448.6840
3.743223
0.466049
123.612430
-5.066049
1.824004
-3.242045
c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1ccc(-c2cc...
1096.931093
-SD~+SD
2322839
1703911
C1cc2c(ccc(-c3cccnc3)c2c1)-c1sc(-c2scc3cc[SiH2...
C26H19NS2Si2
465.7471
4.881051
0.657693
114.218791
-5.257693
1.876279
-3.381414
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2518.338514
-SD~+SD
2322840
1814506
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
C23H16N2S3Si2
472.7634
3.353182
0.461167
111.904241
-5.061167
1.892000
-3.169167
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
3271.765649
-SD~+SD
2322841
2559314
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
C23H15NOS3Si2
473.7475
4.263377
0.688326
95.325067
-5.288326
1.998713
-3.289613
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
3385.313862
-SD~+SD
2322842
2351086
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C24H16N2S3Si2
484.7744
6.662663
0.850060
120.627407
-5.450060
1.839686
-3.610374
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
4790.072557
+SD~+2SD
2322843
1712111
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c...
C24H12OS6Si
536.8398
2.951709
0.279912
162.292795
-4.879912
1.615145
-3.264767
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c...
14707.807555
+2SD~+3SD
2322844
2543603
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1cnc(...
C22H14N4S3Si2
486.7506
0.000000
0.000000
0.000000
-5.632512
1.454082
-4.178430
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1cnc(...
5067.524911
+SD~+2SD
2322845
2304057
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C22H14N4S3Si2
486.7506
9.335485
1.120744
128.197094
-5.720744
1.798600
-3.922144
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
5067.524911
+SD~+2SD
2322846
2007035
[SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
C26H18S3Si2
482.7982
2.498209
0.834995
46.046052
-5.434995
2.433160
-3.001835
c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
4520.430936
+SD~+2SD
2322847
1961981
C1ccc2c1c(sc2-c1scc2cc[SiH2]c12)-c1ccc(cc1)-c1...
C25H16S3SeSi
519.6454
2.679067
0.659243
62.544032
-5.259243
2.258468
-3.000775
c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1ccc(cc1)-...
10832.927433
+SD~+2SD
2322848
2754558
[SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c...
C24H13NOS5Si
519.7887
1.272400
0.102802
190.489616
-4.702802
1.490950
-3.211851
c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c...
10862.777683
+SD~+2SD
2322849 rows × 13 columns
In [57]:
pd.value_counts(data['mass_group'])
Out[57]:
-SD~+SD 1603364
-2SD~-SD 323872
+SD~+2SD 290594
-3SD~-2SD 66373
+2SD~+3SD 34290
<(-3SD) 3144
>(+3SD) 1211
Name: mass_group, dtype: int64
In [58]:
1603364/data.shape[0]
Out[58]:
0.690257524272994
In [59]:
(1603364+323872+290594)/data.shape[0]
Out[59]:
0.9547887098989215
In [60]:
(1603364+323872+290594+66373+34290)/data.shape[0]
Out[60]:
0.998124716673361
In [61]:
data.groupby(['mass_group'])['voc'].describe()
Out[61]:
mass_group
<(-3SD) count 3.144000e+03
mean 6.219819e-01
std 4.210254e-01
min 0.000000e+00
25% 2.923413e-01
50% 5.870970e-01
75% 8.999475e-01
max 2.178957e+00
-3SD~-2SD count 6.637300e+04
mean 6.665345e-01
std 3.581035e-01
min 0.000000e+00
25% 4.086398e-01
50% 6.538720e-01
75% 9.035142e-01
max 2.135583e+00
-2SD~-SD count 3.238720e+05
mean 6.320939e-01
std 3.352676e-01
min 0.000000e+00
25% 3.982995e-01
50% 6.221896e-01
75% 8.524997e-01
max 2.103816e+00
-SD~+SD count 1.603364e+06
mean 5.465694e-01
std 2.958476e-01
min 0.000000e+00
25% 3.423544e-01
50% 5.364526e-01
75% 7.420294e-01
max 1.991945e+00
+SD~+2SD count 2.905940e+05
mean 4.830717e-01
std 2.730553e-01
min 0.000000e+00
25% 2.971842e-01
50% 4.845037e-01
75% 6.709221e-01
max 1.713406e+00
+2SD~+3SD count 3.429000e+04
mean 4.177375e-01
std 2.301736e-01
min 0.000000e+00
25% 2.761383e-01
50% 4.348518e-01
75% 5.779685e-01
max 1.274693e+00
>(+3SD) count 1.211000e+03
mean 3.944460e-01
std 1.581920e-01
min 0.000000e+00
25% 3.047716e-01
50% 3.960159e-01
75% 4.899024e-01
max 8.624765e-01
Name: voc, dtype: float64
In [62]:
data.groupby(['mass_group'])['voc'].mean().plot()
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x106070208>
In [72]:
data.groupby(['mass_group'])['tmp_smiles_str'].count().plot().hist(20)
Out[72]:
(array([ 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
array([ 19.5, 19.6, 19.7, 19.8, 19.9, 20. , 20.1, 20.2, 20.3,
20.4, 20.5]),
<a list of 10 Patch objects>)
In [ ]:
Content source: danielfather7/teach_Python
Similar notebooks: